Author: Amanda Everitt
Began: 8/20/2018
Finished:: 8/22/2018

[Packages Used]

[Results]

Genes
Initial 27998
-13065 Removed genes that didn’t occur in >17 cells (0.1% of population)
Final 14933
Cells
Initial 17823
-163 Removed cells with nUMI <500 or >100000
-182 Removed cells with nGene <500 or >3000
-82 Removed cells with a mitochondrial percent >30%
Final 17396

1. Filter Data

Load data

dataset_loc <- "/Users/AEveritt/projects/scRNAseq_L5_Tbr1/raw_data/mm10/"
ids <- c("L5HET/", "L5NULL/","L5WT/")

d10x.data <- sapply(ids, function(i){
  d10x <- Seurat::Read10X(file.path(dataset_loc,i))
  colnames(d10x) <- paste(sapply(strsplit(colnames(d10x),split="-"),'[[',1L),i,sep="-")
  d10x
})

experiment.data <- do.call("cbind", d10x.data)

experiment.aggregate <- CreateSeuratObject(
  experiment.data,
  project = "Siavash scRNA", 
  names.field = 2,
  min.cells = 17,
  names.delim = "\\-")
experiment.aggregate 
## An object of class seurat in project Siavash scRNA 
##  14933 genes across 17823 samples.

Calculate proportion of mitochondrial, ribosomal, and pseudo genes in each cell

mito.genes <- grep("^mt-", rownames(experiment.aggregate@data), value = T)
percent.mito <- Matrix::colSums(experiment.aggregate@raw.data[mito.genes, ]) / Matrix::colSums(experiment.aggregate@raw.data)
experiment.aggregate <- AddMetaData(
  object = experiment.aggregate,
  metadata = percent.mito,
  col.name= "percent.mito")

ribo.genes <- grep("^Rps|^Rpl|^Mrps|^Mrpl|^Gm", rownames(experiment.aggregate@data), value = T)
percent.ribo <- Matrix::colSums(experiment.aggregate@raw.data[ribo.genes, ]) / Matrix::colSums(experiment.aggregate@raw.data)
experiment.aggregate <- AddMetaData(
  object = experiment.aggregate,
  metadata = percent.ribo,
  col.name= "percent.ribo")

pseudo.genes <- grep("^Gm", rownames(experiment.aggregate@data), value = T)
percent.pseudo <- Matrix::colSums(experiment.aggregate@raw.data[pseudo.genes, ]) / Matrix::colSums(experiment.aggregate@raw.data)
experiment.aggregate <- AddMetaData(
  object = experiment.aggregate,
  metadata = percent.pseudo,
  col.name= "percent.pseudo")

Filter based on nUMI, nGene, %mito (really fraction)

p1a <- ggplot(experiment.aggregate@meta.data, aes(x=nUMI)) + 
  geom_histogram(binwidth=50, color="black", fill="white") + ggtitle("Distribution of nUMI") +
  geom_vline(aes(xintercept=500),color="red") + geom_vline(aes(xintercept=10000),color="red") + theme_bw()
p1b <- VlnPlot(experiment.aggregate, "nUMI", do.return = TRUE) + geom_hline(yintercept = 10000, color= "red")
p1b <- p1b + ggtitle("Distribution of nUMI by genotype") + xlab("Genotype") + ylab("Count") + theme_bw() + 
  theme(plot.title = element_text(color="black", size=12),
        axis.title.x = element_text(color="black", size=12),
        axis.title.y = element_text(color="black", size=12),
        legend.position = "none")

p2a <- ggplot(experiment.aggregate@meta.data, aes(x=nGene)) + 
  geom_histogram(binwidth=50, color="black", fill="white") + ggtitle("Distribution of nGene") +
  geom_vline(aes(xintercept=500),color="red") + geom_vline(aes(xintercept=3000),color="red") + theme_bw()
p2b <- VlnPlot(experiment.aggregate, "nGene", do.return = TRUE) + geom_hline(yintercept = 3000, color= "red")
p2b <- p2b + ggtitle("Distribution of nGene by genotype") + xlab("Genotype") + ylab("Count") + theme_bw() + 
  theme(plot.title = element_text(color="black", size=12),
        axis.title.x = element_text(color="black", size=12),
        axis.title.y = element_text(color="black", size=12),
        legend.position = "none")

p3a <- ggplot(experiment.aggregate@meta.data, aes(x=percent.mito)) + 
  geom_histogram(binwidth=0.01, color="black", fill="white") + ggtitle("Distribution of mitochondrial percentage") +
  geom_vline(aes(xintercept=0.3),color="red") + theme_bw()
p3b <- VlnPlot(experiment.aggregate, "percent.mito", do.return = TRUE) + geom_hline(yintercept = 0.3, color= "red")
p3b <- p3b + ggtitle("Distribution of mito % by genotype") + xlab("Genotype") + ylab("Count") + theme_bw() + 
  theme(plot.title = element_text(color="black", size=12),
        axis.title.x = element_text(color="black", size=12),
        axis.title.y = element_text(color="black", size=12),
        legend.position = "none")

grid.arrange(p1a,p2a, p3a, p1b, p2b, p3b, ncol=3)

experiment.aggregate <- FilterCells(object = experiment.aggregate, subset.names = "nUMI", 
                                    low.thresholds = 500 , high.thresholds = 10000)
experiment.aggregate <- FilterCells(object = experiment.aggregate, subset.names = "nGene", 
                                    low.thresholds = 500 , high.thresholds = 3000)
experiment.aggregate <- FilterCells(object = experiment.aggregate, subset.names = "percent.mito", 
                                    low.thresholds = -Inf , high.thresholds = 0.3)
Note: ribo and pseudo genes had a seemingly normal distribution so they were not used for filtering (among other biological reasons)
#Save Data 
save(experiment.aggregate, file=paste0(out_dir,"/experiment.aggregate.filtered.Rdata"))

#Clean up workspace
#detach("package:Seurat", unload=TRUE)
#detach("package:cowplot", unload=TRUE)
#gc()
#R.utils::gcDLLs() 

2. Detect any outliers based on QC metrics

core = SingleCellExperiment(assays = list(counts = as.matrix(experiment.aggregate@data)),
                            colData = as.data.frame(experiment.aggregate@meta.data), 
                            rowData = rownames(as.matrix(experiment.aggregate@data)))
core <- calculateQCMetrics(core)
core <- normalize(core)
## Warning in .local(object, ...): using library sizes as size factors
scater::plotExprsFreqVsMean(core)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

PCA based on expression

pca_exp <- scater::runPCA(core, use_coldata = F, detect_outliers = TRUE)
plotReducedDim(pca_exp, use_dimred="PCA", colour_by="orig.ident")

summary(pca_exp$outlier)
## Length  Class   Mode 
##      0   NULL   NULL

PCA based on metadata

pca_coldata <- scater::runPCA(core, use_coldata = T,  detect_outliers = TRUE,
                              selected_variables = c("nGene", "pct_counts_in_top_500_features", "percent.mito","nUMI"))
summary(pca_coldata$outlier)
##    Mode   FALSE    TRUE 
## logical   16702     694
plotReducedDim(pca_coldata, use_dimred="PCA_coldata", colour_by="orig.ident")

plotReducedDim(pca_coldata, use_dimred="PCA_coldata", colour_by="outlier")

possible_outliers <- rownames(pca_coldata@colData[pca_coldata$outlier == T,])
save(possible_outliers, file=paste0(out_dir,"/possible_outliers.Rdata"))
#scater::plotExplanatoryVariables(core)  #takes too long to plot for knitr and im impatient
knitr::include_graphics(paste0(out_dir,'/expl_plot.jpeg'))

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.6_1/lib/libopenblasp-r0.3.6.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3               mvoutlier_2.0.9            
##  [3] sgeostat_1.0-27             scater_1.12.2              
##  [5] SingleCellExperiment_1.6.0  SummarizedExperiment_1.14.1
##  [7] DelayedArray_0.10.0         BiocParallel_1.18.1        
##  [9] matrixStats_0.55.0          Biobase_2.44.0             
## [11] GenomicRanges_1.36.1        GenomeInfoDb_1.20.0        
## [13] IRanges_2.18.3              S4Vectors_0.22.1           
## [15] BiocGenerics_0.30.0         Seurat_2.3.4               
## [17] Matrix_1.2-17               cowplot_1.0.0              
## [19] ggplot2_3.2.1               knitr_1.25                 
## 
## loaded via a namespace (and not attached):
##   [1] reticulate_1.13          R.utils_2.9.0           
##   [3] tidyselect_0.2.5         htmlwidgets_1.5.1       
##   [5] grid_3.6.0               ranger_0.11.2           
##   [7] Rtsne_0.15               munsell_0.5.0           
##   [9] codetools_0.2-16         ica_1.0-2               
##  [11] sROC_0.1-2               withr_2.1.2             
##  [13] colorspace_1.4-1         rstudioapi_0.10         
##  [15] ROCR_1.0-7               robustbase_0.93-5       
##  [17] dtw_1.21-3               vcd_1.4-4               
##  [19] VIM_4.8.0                gbRd_0.4-11             
##  [21] labeling_0.3             Rdpack_0.11-0           
##  [23] lars_1.2                 GenomeInfoDbData_1.2.1  
##  [25] cvTools_0.3.2            bit64_0.9-7             
##  [27] vctrs_0.2.0              xfun_0.10               
##  [29] diptest_0.75-7           R6_2.4.0                
##  [31] ggbeeswarm_0.6.0         robCompositions_2.1.0   
##  [33] rsvd_1.0.2               hdf5r_1.3.0             
##  [35] flexmix_2.3-15           bitops_1.0-6            
##  [37] reshape_0.8.8            assertthat_0.2.1        
##  [39] SDMTools_1.1-221.1       scales_1.0.0            
##  [41] nnet_7.3-12              beeswarm_0.2.3          
##  [43] gtable_0.3.0             npsurv_0.4-0            
##  [45] rlang_0.4.0              zeallot_0.1.0           
##  [47] splines_3.6.0            lazyeval_0.2.2          
##  [49] acepack_1.4.1            checkmate_1.9.4         
##  [51] yaml_2.2.0               reshape2_1.4.3          
##  [53] abind_1.4-5              backports_1.1.5         
##  [55] Hmisc_4.2-0              tools_3.6.0             
##  [57] zCompositions_1.3.2-1    gplots_3.0.1.1          
##  [59] RColorBrewer_1.1-2       proxy_0.4-23            
##  [61] ggridges_0.5.1           Rcpp_1.0.2              
##  [63] plyr_1.8.4               base64enc_0.1-3         
##  [65] zlibbioc_1.30.0          purrr_0.3.2             
##  [67] RCurl_1.95-4.12          rpart_4.1-15            
##  [69] pbapply_1.4-2            viridis_0.5.1           
##  [71] zoo_1.8-6                haven_2.1.1             
##  [73] cluster_2.1.0            magrittr_1.5            
##  [75] data.table_1.12.4        openxlsx_4.1.0.1        
##  [77] lmtest_0.9-37            RANN_2.6.1              
##  [79] truncnorm_1.0-8          mvtnorm_1.0-11          
##  [81] fitdistrplus_1.0-14      hms_0.5.1               
##  [83] lsei_1.2-0               evaluate_0.14           
##  [85] rio_0.5.16               mclust_5.4.5            
##  [87] readxl_1.3.1             compiler_3.6.0          
##  [89] tibble_2.1.3             KernSmooth_2.23-15      
##  [91] crayon_1.3.4             R.oo_1.22.0             
##  [93] htmltools_0.4.0          mgcv_1.8-29             
##  [95] segmented_1.0-0          pcaPP_1.9-73            
##  [97] Formula_1.2-3            snow_0.4-3              
##  [99] tidyr_1.0.0              rrcov_1.4-7             
## [101] MASS_7.3-51.4            fpc_2.2-3               
## [103] boot_1.3-23              car_3.0-3               
## [105] R.methodsS3_1.7.1        gdata_2.18.0            
## [107] metap_1.1                igraph_1.2.4.1          
## [109] forcats_0.4.0            pkgconfig_2.0.3         
## [111] foreign_0.8-72           laeken_0.5.0            
## [113] sp_1.3-1                 foreach_1.4.7           
## [115] vipor_0.4.5              XVector_0.24.0          
## [117] bibtex_0.4.2             NADA_1.6-1              
## [119] stringr_1.4.0            digest_0.6.21           
## [121] pls_2.7-2                tsne_0.1-3              
## [123] rmarkdown_1.16           cellranger_1.1.0        
## [125] htmlTable_1.13.2         DelayedMatrixStats_1.6.1
## [127] curl_4.2                 kernlab_0.9-27          
## [129] gtools_3.8.1             modeltools_0.2-22       
## [131] lifecycle_0.1.0          nlme_3.1-141            
## [133] jsonlite_1.6             carData_3.0-2           
## [135] BiocNeighbors_1.2.0      viridisLite_0.3.0       
## [137] pillar_1.4.2             lattice_0.20-38         
## [139] GGally_1.4.0             httr_1.4.1              
## [141] DEoptimR_1.0-8           survival_2.44-1.1       
## [143] glue_1.3.1               zip_2.0.4               
## [145] png_0.1-7                prabclus_2.3-1          
## [147] iterators_1.0.12         bit_1.1-14              
## [149] class_7.3-15             stringi_1.4.3           
## [151] mixtools_1.1.0           BiocSingular_1.0.0      
## [153] doSNOW_1.0.18            latticeExtra_0.6-28     
## [155] caTools_1.17.1.2         dplyr_0.8.3             
## [157] irlba_2.3.3              e1071_1.7-2             
## [159] ape_5.3